library(plyr)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.2 ✓ purrr 0.3.4
✓ tibble 3.0.3 ✓ dplyr 1.0.2
✓ tidyr 1.1.2 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.0
── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::arrange() masks plyr::arrange()
x purrr::compact() masks plyr::compact()
x dplyr::count() masks plyr::count()
x dplyr::failwith() masks plyr::failwith()
x dplyr::filter() masks stats::filter()
x dplyr::id() masks plyr::id()
x dplyr::lag() masks stats::lag()
x dplyr::mutate() masks plyr::mutate()
x dplyr::rename() masks plyr::rename()
x dplyr::summarise() masks plyr::summarise()
x dplyr::summarize() masks plyr::summarize()
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
library(plotly)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following objects are masked from ‘package:plyr’:
arrange, mutate, rename, summarise
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
#Import the dataset, random select 50,000 rows
original <- read.csv('https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-06.csv')
original<-drop_na(original)
unique(original$RatecodeID)
[1] 1 2 3 5 4 99 6
original <- subset.data.frame(original, original$RatecodeID != 99, drop = TRUE)
set.seed(100518243)
index <- sample(1:nrow(original),50000)
june2020 <- (original[index,])
attach(june2020)
dim(june2020)
[1] 50000 18
problems(june2020)
Transform the datetime columns from character to datetime data types
june2020$tpep_pickup_datetime <- ymd_hms(june2020$tpep_pickup_datetime)
june2020$tpep_dropoff_datetime <- ymd_hms(june2020$tpep_dropoff_datetime)
Count the number of null value rows for each column
sapply(june2020, function(y) sum(length(which(is.na(y)))))
VendorID tpep_pickup_datetime tpep_dropoff_datetime
0 0 0
passenger_count trip_distance RatecodeID
0 0 0
store_and_fwd_flag PULocationID DOLocationID
0 0 0
payment_type fare_amount extra
0 0 0
mta_tax tip_amount tolls_amount
0 0 0
improvement_surcharge total_amount congestion_surcharge
0 0 0
Convert columns to categorical factors
june2020$store_and_fwd_flag <- parse_factor(june2020$store_and_fwd_flag)
june2020$payment_type <- factor(june2020$payment_type)
june2020$VendorID <- factor(june2020$VendorID)
june2020$payment_type <- mapvalues(payment_type, from = c("1", "2", "3","4","5"), to = c("Credit Card", "Cash","No charge","Dispute","Unknown"))
june2020$VendorID <- mapvalues(VendorID, from = c("1", "2"), to = c("Creative Mobile Technologies", "VeriFone Inc"))
june2020$RatecodeID <- mapvalues(payment_type, from = c("1", "2", "3","4","5"), to = c("Standard Rate", "JFK","Newark","Nassau or Westchester","Negotiated fare"))
june2020$pickup_day <- factor(day(tpep_pickup_datetime))
june2020$pickup_month <- factor(month(tpep_pickup_datetime, label = TRUE))
june2020$pickup_dayofweek <- factor(wday(tpep_pickup_datetime, label = TRUE))
june2020$dropoff_day <- factor(day(tpep_dropoff_datetime))
june2020$dropoff_month <- factor(month(tpep_dropoff_datetime, label = TRUE))
june2020$dropoff_dayofweek <- factor(wday(tpep_dropoff_datetime, label = TRUE))
june2020$pickup_hour <- factor(hour(tpep_pickup_datetime))
june2020$dropoff_hour <- factor(hour(tpep_dropoff_datetime))
str(june2020)
'data.frame': 50000 obs. of 26 variables:
$ VendorID : chr "VeriFone Inc" "Creative Mobile Technologies" "VeriFone Inc" "Creative Mobile Technologies" ...
$ tpep_pickup_datetime : POSIXct, format: "2020-06-14 14:06:41" "2020-06-30 20:03:34" ...
$ tpep_dropoff_datetime: POSIXct, format: "2020-06-14 14:12:58" "2020-06-30 20:04:44" ...
$ passenger_count : int 1 0 1 1 1 1 2 1 4 1 ...
$ trip_distance : num 1.71 0.4 1.82 3.2 3 1.97 1.26 0.87 1.2 1.7 ...
$ RatecodeID : chr "JFK" "JFK" "Standard Rate" "Standard Rate" ...
$ store_and_fwd_flag : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
$ PULocationID : int 140 140 140 249 43 42 236 239 100 74 ...
$ DOLocationID : int 233 263 142 48 161 238 262 143 234 263 ...
$ payment_type : chr "Cash" "Cash" "Credit Card" "Credit Card" ...
$ fare_amount : num 8 3.5 9 11 11.5 9.5 7.5 6 6 6.5 ...
$ extra : num 0 3 0 3.5 0 0 0 0 2.5 2.5 ...
$ mta_tax : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
$ tip_amount : num 0 0 1.5 3.05 3.7 0 0 2 1 0 ...
$ tolls_amount : num 0 0 0 0 0 0 0 0 0 0 ...
$ improvement_surcharge: num 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 ...
$ total_amount : num 11.3 7.3 13.8 18.4 18.5 ...
$ congestion_surcharge : num 2.5 2.5 2.5 2.5 2.5 0 2.5 2.5 2.5 2.5 ...
$ pickup_day : Factor w/ 31 levels "1","2","3","4",..: 14 30 26 9 5 12 29 19 16 1 ...
$ pickup_month : Ord.factor w/ 3 levels "Jan"<"May"<"Jun": 3 3 3 3 3 3 3 3 3 3 ...
$ pickup_dayofweek : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 1 3 6 3 6 6 2 6 3 2 ...
$ dropoff_day : Factor w/ 30 levels "1","2","3","4",..: 14 30 26 9 5 12 29 19 16 1 ...
$ dropoff_month : Ord.factor w/ 3 levels "Jan"<"Jun"<"Jul": 2 2 2 2 2 2 2 2 2 2 ...
$ dropoff_dayofweek : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 1 3 6 3 6 6 2 6 3 2 ...
$ pickup_hour : Factor w/ 24 levels "0","1","2","3",..: 15 21 15 18 16 11 16 15 16 13 ...
$ dropoff_hour : Factor w/ 24 levels "0","1","2","3",..: 15 21 15 18 16 11 16 15 16 13 ...
summary(june2020)
VendorID tpep_pickup_datetime tpep_dropoff_datetime
Length:50000 Min. :2009-01-01 00:02:59 Min. :2009-01-01 00:07:40
Class :character 1st Qu.:2020-06-10 11:55:41 1st Qu.:2020-06-10 12:05:16
Mode :character Median :2020-06-18 11:17:42 Median :2020-06-18 11:28:44
Mean :2020-06-17 12:01:02 Mean :2020-06-17 12:14:35
3rd Qu.:2020-06-24 23:12:52 3rd Qu.:2020-06-24 23:29:09
Max. :2020-06-30 23:55:05 Max. :2020-07-01 13:23:17
passenger_count trip_distance RatecodeID store_and_fwd_flag
Min. :0.000 Min. : 0.000 Length:50000 N:49594
1st Qu.:1.000 1st Qu.: 0.980 Class :character Y: 406
Median :1.000 Median : 1.700 Mode :character
Mean :1.357 Mean : 2.807
3rd Qu.:1.000 3rd Qu.: 3.130
Max. :6.000 Max. :167.500
PULocationID DOLocationID payment_type fare_amount
Min. : 1.0 Min. : 1.0 Length:50000 Min. :-180.09
1st Qu.:107.0 1st Qu.: 90.0 Class :character 1st Qu.: 6.00
Median :161.0 Median :157.0 Mode :character Median : 8.50
Mean :160.5 Mean :155.6 Mean : 11.81
3rd Qu.:234.0 3rd Qu.:234.0 3rd Qu.: 13.50
Max. :265.0 Max. :265.0 Max. : 509.70
extra mta_tax tip_amount tolls_amount
Min. :-4.500 Min. :-0.5000 Min. :-36.30 Min. :-6.1200
1st Qu.: 0.000 1st Qu.: 0.5000 1st Qu.: 0.00 1st Qu.: 0.0000
Median : 0.500 Median : 0.5000 Median : 1.50 Median : 0.0000
Mean : 1.133 Mean : 0.4914 Mean : 1.76 Mean : 0.2174
3rd Qu.: 2.500 3rd Qu.: 0.5000 3rd Qu.: 2.65 3rd Qu.: 0.0000
Max. : 7.000 Max. : 0.5000 Max. :104.56 Max. :69.0000
improvement_surcharge total_amount congestion_surcharge pickup_day
Min. :-0.3000 Min. :-180.39 Min. :-2.500 26 : 2491
1st Qu.: 0.3000 1st Qu.: 10.30 1st Qu.: 2.500 25 : 2380
Median : 0.3000 Median : 13.56 Median : 2.500 24 : 2335
Mean : 0.2966 Mean : 16.98 Mean : 2.108 29 : 2269
3rd Qu.: 0.3000 3rd Qu.: 18.96 3rd Qu.: 2.500 30 : 2261
Max. : 0.3000 Max. : 627.35 Max. : 2.500 23 : 2243
(Other):36021
pickup_month pickup_dayofweek dropoff_day dropoff_month dropoff_dayofweek
Jan: 1 Sun:4475 26 : 2481 Jan: 1 Sun:4496
May: 1 Mon:9018 25 : 2382 Jun:49988 Mon:9010
Jun:49998 Tue:9216 24 : 2336 Jul: 11 Tue:9197
Wed:7097 29 : 2274 Wed:7115
Thu:7192 30 : 2254 Thu:7195
Fri:7759 23 : 2243 Fri:7750
Sat:5243 (Other):36030 Sat:5237
pickup_hour dropoff_hour
14 : 3903 14 : 3855
15 : 3891 15 : 3817
16 : 3725 16 : 3708
13 : 3630 13 : 3636
17 : 3594 17 : 3618
12 : 3498 12 : 3464
(Other):27759 (Other):27902
glimpse(june2020)
Rows: 50,000
Columns: 26
$ VendorID <chr> "VeriFone Inc", "Creative Mobile Technologies", "Ver…
$ tpep_pickup_datetime <dttm> 2020-06-14 14:06:41, 2020-06-30 20:03:34, 2020-06-2…
$ tpep_dropoff_datetime <dttm> 2020-06-14 14:12:58, 2020-06-30 20:04:44, 2020-06-2…
$ passenger_count <int> 1, 0, 1, 1, 1, 1, 2, 1, 4, 1, 1, 1, 3, 1, 2, 1, 1, 1…
$ trip_distance <dbl> 1.71, 0.40, 1.82, 3.20, 3.00, 1.97, 1.26, 0.87, 1.20…
$ RatecodeID <chr> "JFK", "JFK", "Standard Rate", "Standard Rate", "Sta…
$ store_and_fwd_flag <fct> N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N…
$ PULocationID <int> 140, 140, 140, 249, 43, 42, 236, 239, 100, 74, 186, …
$ DOLocationID <int> 233, 263, 142, 48, 161, 238, 262, 143, 234, 263, 68,…
$ payment_type <chr> "Cash", "Cash", "Credit Card", "Credit Card", "Credi…
$ fare_amount <dbl> 8.0, 3.5, 9.0, 11.0, 11.5, 9.5, 7.5, 6.0, 6.0, 6.5, …
$ extra <dbl> 0.0, 3.0, 0.0, 3.5, 0.0, 0.0, 0.0, 0.0, 2.5, 2.5, 0.…
$ mta_tax <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.…
$ tip_amount <dbl> 0.00, 0.00, 1.50, 3.05, 3.70, 0.00, 0.00, 2.00, 1.00…
$ tolls_amount <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ improvement_surcharge <dbl> 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.…
$ total_amount <dbl> 11.30, 7.30, 13.80, 18.35, 18.50, 10.30, 10.80, 11.3…
$ congestion_surcharge <dbl> 2.5, 2.5, 2.5, 2.5, 2.5, 0.0, 2.5, 2.5, 2.5, 2.5, 2.…
$ pickup_day <fct> 14, 30, 26, 9, 5, 12, 29, 19, 16, 1, 27, 12, 14, 26,…
$ pickup_month <ord> Jun, Jun, Jun, Jun, Jun, Jun, Jun, Jun, Jun, Jun, Ju…
$ pickup_dayofweek <ord> Sun, Tue, Fri, Tue, Fri, Fri, Mon, Fri, Tue, Mon, Sa…
$ dropoff_day <fct> 14, 30, 26, 9, 5, 12, 29, 19, 16, 1, 27, 12, 14, 26,…
$ dropoff_month <ord> Jun, Jun, Jun, Jun, Jun, Jun, Jun, Jun, Jun, Jun, Ju…
$ dropoff_dayofweek <ord> Sun, Tue, Fri, Tue, Fri, Fri, Mon, Fri, Tue, Mon, Sa…
$ pickup_hour <fct> 14, 20, 14, 17, 15, 10, 15, 14, 15, 12, 17, 16, 17, …
$ dropoff_hour <fct> 14, 20, 14, 17, 15, 10, 15, 14, 15, 12, 17, 16, 18, …
head(june2020)
write.csv(june2020,"nyc_data.csv")
ggplotly(ggplot(data = june2020) + geom_bar(mapping = aes(x= passenger_count, fill=passenger_count)))
ggplotly(ggplot(data = june2020) +
geom_bar(mapping = aes(x = VendorID, fill=VendorID)))
ggplotly(ggplot(data = june2020) +
geom_bar(mapping = aes(x = payment_type, fill=payment_type)))
ggplotly(ggplot(data = june2020) +
geom_bar(mapping = aes(x = RatecodeID, fill=RatecodeID)))
par(mfrow=c(2,2))
ggplotly(ggplot(data = june2020) + geom_bar(mapping = aes(x=pickup_dayofweek, fill=pickup_dayofweek)) + ggtitle("Pick Up Days of the week"))
ggplotly(ggplot(data = june2020) + geom_bar(mapping = aes(x=dropoff_dayofweek, fill=dropoff_dayofweek)) + ggtitle("Drop Off Days of the week"))
ggplotly(ggplot(data = june2020) + geom_bar(mapping = aes(x=pickup_hour, fill=pickup_hour)) + ggtitle("Pick Up Hours of the week"))
ggplotly(ggplot(data = june2020) + geom_bar(mapping = aes(x=dropoff_hour, fill=dropoff_hour)) + ggtitle("Drop Off Hours of the week"))
ggplot(june2020, aes(x = pickup_dayofweek, y = total_amount)) +
geom_point()
ggplot(june2020, aes(x=trip_distance, y=total_amount))+geom_point()